Load merged Summarized Experiment data and pre-processing
se=readRDS(file = paste0(MEDIAsave,"Mydata_EGAD00001001331_merged.rds"))
colData(se)$Donor=factor(colData(se)$Donor)
colData(se)$Status=factor(colData(se)$Status)
keep <- rowSums(assay(se)>=15) >= 6 #at least 50% of the samples
se1 <- se[keep,]
tmp <-assay(se1)
tmp <-cbind(gene_id=rownames(tmp),tmp)
tmp <- merge(tmp,ens2gene,by.y="gene_id")
Fix duplicates: mean expression value across transcripts has been
used for each duplicate gene
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp$gene_name)
tmp1 <-tmp[!tmp$gene_name %in% gn_dup,]
counts2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tmp)))
colnames(counts2) <-colnames(tmp)
rownames(counts2) <-paste0("ENS_",1:length(gn_dup))
counts2 <-counts2[,-c(1,26)]
for(i in 1:length(gn_dup)){
x=gn_dup[i]
y=tmp[tmp$gene_name %in% x,-c(1,26)]
counts2[i,1:ncol(counts2)] <-apply(y,2,function(x) round(mean(as.numeric(x))))
}
counts2 <-cbind(gene_id=rownames(counts2),counts2,gene_name=gn_dup)
tmp3 <-rbind(tmp1,as.matrix(counts2))
dataDds <-tmp3
ccounts <-tmp3[,2:(ncol(tmp3)-1)]
ccounts <-t(apply(ccounts,1,as.numeric))
rownames(ccounts) <-tmp3$gene_id
colnames(ccounts) <-colnames(tmp3[,2:(ncol(tmp3)-1)])
dds <- DESeqDataSetFromMatrix(countData = ccounts,
colData = colData(se),
design = ~ Donor + Status)
dds$Status <- factor(dds$Status, levels = c("Normal","Osteoarthritis"))
dds$Status <- relevel(dds$Status, ref = "Normal")
keep <- rowSums(counts(dds)>=15) >= 6 #re-check low counts genes
dds <- dds[keep,]
cat("Merged data Dimension after filtering and pre-processing: ", dim(dds))
## Merged data Dimension after filtering and pre-processing: 18718 24
Principal component analysis
vsd <- vst(dds, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("Donor", "Status"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Donor, shape=Status,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Batch correction
assay(vsd)<-limma::removeBatchEffect(assay(vsd), vsd$Donor)
pcaData <- plotPCA(vsd, intgroup=c("Donor", "Status"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Donor, shape=Status,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Differential expression analysis using DESeq2
dds <- DESeq(dds)
resultsNames(dds)
## [1] "Intercept" "Donor_SMBP051_vs_SMBP049"
## [3] "Donor_SMBP053_vs_SMBP049" "Donor_SMBP054_vs_SMBP049"
## [5] "Donor_SMBP055_vs_SMBP049" "Donor_SMBP056_vs_SMBP049"
## [7] "Donor_SMBP059_vs_SMBP049" "Donor_SMBP060_vs_SMBP049"
## [9] "Donor_SMBP061_vs_SMBP049" "Donor_SMBP064_vs_SMBP049"
## [11] "Donor_SMBP065_vs_SMBP049" "Donor_SMBP066_vs_SMBP049"
## [13] "Status_Osteoarthritis_vs_Normal"
df.res <- as.data.frame(results(dds,name="Status_Osteoarthritis_vs_Normal" ,pAdjustMethod = "fdr"))
summary(df.res)
## baseMean log2FoldChange lfcSE stat
## Min. : 7 Min. :-2.053092 Min. :0.03245 Min. :-5.818743
## 1st Qu.: 64 1st Qu.:-0.115682 1st Qu.:0.10165 1st Qu.:-0.903174
## Median : 328 Median : 0.001204 Median :0.13893 Median : 0.009566
## Mean : 2049 Mean : 0.038294 Mean :0.16419 Mean : 0.080029
## 3rd Qu.: 1142 3rd Qu.: 0.141979 3rd Qu.:0.20483 3rd Qu.: 0.986378
## Max. :3442898 Max. : 2.255348 Max. :0.97061 Max. : 6.853373
## pvalue padj
## Min. :0.0000 Min. :0.0000001
## 1st Qu.:0.1183 1st Qu.:0.4727404
## Median :0.3450 Median :0.6900214
## Mean :0.3972 Mean :0.6501577
## 3rd Qu.:0.6522 3rd Qu.:0.8696312
## Max. :0.9999 Max. :0.9998726
df.res$ens <-rownames(df.res)
dataDds <-dataDds[,c(1,26)]
colnames(dataDds) <-c("ens","gene_id")
df.res<-merge(dataDds,df.res,by.x="ens")
df.res <-df.res[,c(1,3:8,2)]
colnames(df.res)[8] <-"hgnc_symbol"
rownames(df.res) <-df.res$ens
rm(dataDds,tmp3)
Save files
save(df.res, file=paste0(MEDIAsave,"results_DE_EGApaired.RData"))
Volcano Plot of DE genes
annot <-df.res[,c(3,7,8)]
annot <-annot[complete.cases(annot),]
annot$col <-"gray50"
annot[annot$log2FoldChange <= -1 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange >= 1 & annot$padj <=0.05, "col"] <- "red"
annot$label <- NA
annot[annot$log2FoldChange >= 1.5 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange >= 1.5 & annot$padj <=0.05, "hgnc_symbol"]
annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "hgnc_symbol"]
ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
geom_point(size=1) +
geom_label_repel(size=4,
min.segment.length = 0,direction = "both",
max.overlaps =40,
color="black",
segment.linetype=2) +
scale_shape_manual(values=c(8, 19))+
scale_color_manual(values = c("red","gray50","springgreen3"),
breaks = c("red","gray50","springgreen3"),
labels = c("UP","notDE","DOWN")) +
labs(color="Legend")+
xlab("log2(FC)")+
ylab("-log10(FDR)")+
theme_bw()+
ggtitle("Volcano plot of DE genes - Dataset 2")+
geom_vline(xintercept=c(-1, 1), col="gray", linetype=2)+
geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)

Enrichment with GSEA: BP and REACTOME
hs=get("org.Hs.eg.db")
genes1 <- df.res[, c("log2FoldChange","hgnc_symbol")]
genes <- genes1$log2FoldChange
names(genes) <-genes1$hgnc_symbol
genes <-sort(genes, decreasing = TRUE)
gsebp <- gseGO(geneList=genes,
ont ="BP",
keyType = "SYMBOL",
pvalueCutoff = 0.01,
eps = 0,
verbose = TRUE,
OrgDb = hs,
pAdjustMethod = "fdr")
dotplot(gsebp, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)
gsreac <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
eps = 0,
verbose = TRUE,
minGSSize =5, pAdjustMethod = "fdr",
pvalueCutoff =0.1)
dotplot(gsreac, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
theme(axis.text.y = element_text(size = 7))

Load data and pre-processing
load(paste0(wdd,"data/txi.RData"))
patientDetails<-read.csv(paste0(wdd,"data/patientDetails_all_withMed.csv"),stringsAsFactors = F)
txi$counts <- txi$counts[,match(unique(as.character(patientDetails$name)),colnames(txi$counts))]
txi$abundance <- txi$abundance[,match(unique(as.character(patientDetails$name)),colnames(txi$abundance))]
txi$length <- txi$length[,match(unique(as.character(patientDetails$name)),colnames(txi$length))]
patientDetails <-patientDetails[,-3]
patientDetails<-patientDetails[!duplicated(patientDetails$name),]
patientDetails$OA<-ifelse(grepl("SH0", patientDetails$name),"OA","NonOA")
Batches<-as.factor(patientDetails$batch)
Disease<- as.factor(patientDetails$OA)
colData<-data.frame(Batches=Batches,Disease)
dds<-DESeqDataSetFromTximport(txi,colData,~Batches+Disease)
filter <- rowSums(counts(dds)>=5) >= 35 #at least 50% of the samples
dds <- dds[filter,]
tmp <-counts(dds)
tmp <-cbind(gene_id=rownames(tmp),tmp)
tmp <-merge(tmp,ens2gene,by.y="gene_id")
Fix duplicates: mean expression value across transcripts has been
used for each duplicate gene
dupp <-tmp[duplicated(tmp$gene_name),]
gn_dup <-unique(dupp$gene_name)
tmp1 <-tmp[!tmp$gene_name %in% gn_dup,]
counts2 <-as.data.frame(matrix(NA, length(gn_dup),ncol(tmp)))
colnames(counts2) <-colnames(tmp)
rownames(counts2) <-paste0("ENS_",1:length(gn_dup))
counts2 <-counts2[,-c(1,72)]
for(i in 1:length(gn_dup)){
x=gn_dup[i]
y=tmp[tmp$gene_name %in% x,-c(1,72)]
counts2[i,1:ncol(counts2)] <-apply(y,2,function(x) round(mean(as.numeric(x))))
}
counts2 <-cbind(gene_id=rownames(counts2),counts2,gene_name=gn_dup)
tmp3 <-rbind(tmp1,as.matrix(counts2))
ccounts <-tmp3[,2:(ncol(tmp3)-1)]
ccounts <-t(apply(ccounts,1,as.numeric))
rownames(ccounts) <-tmp3$gene_id
colnames(ccounts) <-colnames(tmp3[,2:(ncol(tmp3)-1)])
dds0 <- DESeqDataSetFromMatrix(countData = ccounts,
colData = colData,
design = ~Batches+Disease)
filter <- rowSums(counts(dds0)>=5) >= 35 #re-check low counts genes
dds0 <- dds0[filter,]
Principal component analysis
vsd <- vst(dds0, blind=FALSE)
pcaData <- plotPCA(vsd, intgroup=c("Batches", "Disease"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Batches, shape=Disease,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Batch correction
assay(vsd)<-limma::removeBatchEffect(assay(vsd), vsd$Batches)
pcaData <- plotPCA(vsd, intgroup=c("Batches", "Disease"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=Batches, shape=Disease,label = rownames(pcaData))) +
geom_point(size=3,alpha=1) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()+
ggtitle("All samples after merging replicate Runs")+
theme_bw()

Differential expression analysis using DESeq2
dds2<-DESeq(dds0)
resultsNames(dds2)
## [1] "Intercept" "Batches_2_vs_1" "Batches_3_vs_1"
## [4] "Batches_4_vs_1" "Batches_5_vs_1" "Batches_6_vs_1"
## [7] "Batches_7_vs_1" "Batches_8_vs_1" "Batches_9_vs_1"
## [10] "Batches_10_vs_1" "Batches_11_vs_1" "Batches_12_vs_1"
## [13] "Batches_13_vs_1" "Disease_OA_vs_NonOA"
OAvsnonOA<-as.data.frame(results(dds2,name="Disease_OA_vs_NonOA",pAdjustMethod = "fdr"))
OAvsnonOA_dup <-OAvsnonOA[grep("ENS_",rownames(OAvsnonOA)),]
names(gn_dup) <-counts2$gene_id
setdiff(names(gn_dup),rownames(OAvsnonOA_dup)) ## check if some corrected duplicates did not exceed the minimum of counts for at least half of the patients
## [1] "ENS_202"
#[1] "ENS_202"
which(names(gn_dup) %in% "ENS_202")
## [1] 202
# 202
OAvsnonOA_dup <-OAvsnonOA_dup[names(gn_dup[-202]),]
OAvsnonOA_dup$hgnc_symbol <-gn_dup[-202]
OAvsnonOA_dup <-cbind(Row.names=rownames(OAvsnonOA_dup), OAvsnonOA_dup)
OAvsnonOA<-merge(OAvsnonOA,ens2gene,by.x="row.names",by.y="gene_id")
colnames(OAvsnonOA)[8] <-"hgnc_symbol"
OAvsnonOA_dup <-OAvsnonOA_dup[,colnames(OAvsnonOA)]
OAvsnonOA <-rbind(OAvsnonOA, OAvsnonOA_dup)
OAvsnonOA<-OAvsnonOA[order(OAvsnonOA$log2FoldChange,decreasing=TRUE),]
Save files
OAvsnonOA <-OAvsnonOA[complete.cases(OAvsnonOA),]
save(OAvsnonOA, dds0, OAvsnonOA_dup, gn_dup,file=paste0(MEDIAsave,"genide-bulk-unpaired.RData"))
save(OAvsnonOA,file=paste0(MEDIAsave,"genide-bulk-unpaired_DE.RData"))
Volcano Plot of DE genes
annot <-OAvsnonOA[,c(3,7,8)]
annot <-annot[complete.cases(annot),]
annot$col <-"gray50"
annot[annot$log2FoldChange <= -1.5 & annot$padj <=0.05, "col"] <- "springgreen3"
annot[annot$log2FoldChange >= 1.5 & annot$padj <=0.05, "col"] <- "red"
annot$label <- NA
annot[annot$log2FoldChange >= 3 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange >= 3 & annot$padj <=0.05, "hgnc_symbol"]
annot[annot$log2FoldChange <= -3 & annot$padj <=0.05, "label"] <- annot[annot$log2FoldChange <= -3 & annot$padj <=0.05, "hgnc_symbol"]
ggplot(data=annot, aes(x=log2FoldChange, y=-log10(padj), color=col,label=label)) +
geom_point(size=1) +
geom_label_repel(size=4,
min.segment.length = 0,direction = "both",
max.overlaps =40,
color="black",
segment.linetype=2) +
scale_shape_manual(values=c(8, 19))+
scale_color_manual(values = c("red","gray50","springgreen3"),
breaks = c("red","gray50","springgreen3"),
labels = c("UP","notDE","DOWN")) +
labs(color="Legend")+
xlab("log2(FC)")+
ylab("-log10(FDR)")+
ggtitle("Volcano plot of DE genes - Dataset 3")+
theme_bw()+
geom_vline(xintercept=c(-2.5, 2.5), col="gray", linetype=2)+
geom_hline(yintercept=-log10(0.05), col="gray", linetype=2)

Enrichment with GSEA: BP and REACTOME
gene <-OAvsnonOA[order(OAvsnonOA$log2FoldChange, decreasing = TRUE),]
genes <-gene$log2FoldChange
names(genes) <-gene$hgnc_symbol
hs=get("org.Hs.eg.db")
gsebp2 <- gseGO(geneList=genes,
ont ="BP",
keyType = "SYMBOL",
pvalueCutoff = 0.01,
eps = 0,
verbose = TRUE,
OrgDb = hs,
pAdjustMethod = "fdr")
dotplot(gsebp2, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)

rc <-msigdbr(species = "Homo sapiens", category = "C2", subcategory = "CP:REACTOME")
sig <-rc[,c(3,4)]
sig$gs_name <-gsub("^REACTOME_","",sig$gs_name)
gsreac2 <- clusterProfiler::GSEA(genes,TERM2GENE = sig,
eps = 0,
verbose = TRUE,
minGSSize =5, pAdjustMethod = "fdr",
pvalueCutoff =0.1)
dotplot(gsreac2, showCategory = 10, x="NES",split=".sign") + facet_grid(.~.sign)+
theme(axis.text.y = element_text(size = 7))
